Using devices such as Jawbone Up, Nike FuelBand, and Fitbit it is now possible to collect a large amount of data about personal activity relatively inexpensively. These types of devices are part of the quantified self movement - a group of enthusiasts who take measurements about themselves regularly to improve their health, to find patterns in their behavior, or because they are tech geeks. One thing that people regularly do is quantify how much of a particular activity they do, but they rarely quantify how well they do it. In this project, our goal will be to use data from accelerometers on the belt, forearm, arm, and dumbbell of 6 participants. They were asked to perform barbell lifts correctly and incorrectly in 5 different ways. More information is available from the website here:
<http://web.archive.org/web/20161224072740/http:/groupware.les.inf.puc-rio.br/har>
The training data for this project are available here:
https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv
The test data are available here:
https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv
The data for this project come from this source: http://web.archive.org/web/20161224072740/http:/groupware.les.inf.puc-rio.br/har.
The outcome variable is classe, a factor variable with 5 levels. For this data set, participants were asked to perform one set of 10 repetitions of the Unilateral Dumbbell Biceps Curl in 5 different fashions:
Classe A — exactly according to the specification; Classe B — throwing the elbows to the front; Classe C — lifting the dumbbell only halfway; Classe D — lowering the dumbbell only halfway; Classe E — throwing the hips to the front.
Libraries
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Warning: package 'caret' was built under R version 4.0.3
## Loading required package: lattice
library(rattle)
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.0.3
## corrplot 0.84 loaded
library(correlationfunnel)
## Warning: package 'correlationfunnel' was built under R version 4.0.3
## == Using correlationfunnel? ======================================================================
## You might also be interested in applied data science training for business.
## </> Learn more at - www.business-science.io </>
# Load the raw training and quiz (final test) data.
train <- read.csv("Input/pml-training.csv")
test <- read.csv("Input/pml-testing.csv")

# Keep only columns that are less than 95% NA.
# NOTE(review): each set is filtered by its OWN NA pattern — the quiz set's
# summary-statistic columns are entirely NA, so this also removes the
# kurtosis/skewness/max/min/amplitude columns there; confirm both frames end
# up with matching predictor sets.
train <- train %>% select(which(colMeans(is.na(.)) < 0.95))
test <- test %>% select(which(colMeans(is.na(.)) < 0.95))

# Drop summary-statistic and bookkeeping columns by name pattern.
# A single combined regex with grepl() replaces the per-pattern loop and is
# safe when a pattern matches no column: the old `train[, -grep(nam[i], ...)]`
# form would get integer(0) from grep() and silently select ZERO columns.
nam <- c("kurtosis", "skewness", "max", "min", "amplitude", "timestamp", "window")
train <- train[, !grepl(paste(nam, collapse = "|"), names(train))]

nam2 <- c("timestamp", "window")
test <- test[, !grepl(paste(nam2, collapse = "|"), names(test))]

# Quiz-set predictors: drop the first two identifier columns (row id, user).
X_test <- test[, 3:55]
# Reproducible 65/35 split of the training data into a model-building set
# and a hold-out validation set. The seed MUST be set before the random
# draw — previously set.seed(200) was only called after
# createDataPartition(), which has no effect on the split.
set.seed(200)
cv_tr <- createDataPartition(train$classe, p = 0.65, list = FALSE)
cvtrain <- train[cv_tr, ]
# Columns 3:55 = 52 numeric predictors + classe (used by train() formulas).
x_cvtrain <- cvtrain[, 3:55]
cvtest <- train[-cv_tr, ]
# Hold-out predictors (3:54) and outcome (55) kept separate for predict().
x_test <- cvtest[, 3:54]
y_test <- cvtest[, 55]
# Pairwise correlations among the 52 numeric predictors; zero the diagonal
# so self-correlations are ignored when hunting for collinear pairs.
x_tr_corr <- cor(x_cvtrain[, 1:52])
diag(x_tr_corr) <- 0
# Index pairs with |r| > 0.8 (TRUE spelled out, not the reassignable T).
x_tr_corr <- which(abs(x_tr_corr) > 0.8, arr.ind = TRUE)
x_tr_corr_un <- unique(row.names(x_tr_corr))
# all_of() makes the external-vector selection explicit and silences the
# tidyselect ambiguity message shown in the output below.
corrplot(cor(select(x_cvtrain, all_of(x_tr_corr_un))), type = "upper",
         order = "hclust", method = "number")
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(x_tr_corr_un)` instead of `x_tr_corr_un` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
# Binarize the predictors (4 bins, rare levels lumped) once; the binarized
# frame is reused for every per-class correlation funnel below.
corr_funl_cl <- x_cvtrain %>% binarize(n_bins = 4, thresh_infreq = 0.01)

# Correlation funnel against membership in classe A.
corr_a <- corr_funl_cl %>% correlate(target = classe__A)
corr_a %>% plot_correlation_funnel(interactive = TRUE, limits = c(-0.5, 0.5))

# Top 20 distinct features, ordered by absolute correlation strength.
corr_a_sub <- corr_a %>%
  arrange(desc(abs(correlation))) %>%
  distinct(feature) %>%
  head(20)

# Which of those top features are also in the highly-correlated (>0.8) set?
corr_a_sub$feature[which(corr_a_sub$feature %in% x_tr_corr_un)]
## [1] magnet_arm_x accel_arm_x gyros_arm_y gyros_arm_x
## [5] gyros_forearm_y magnet_arm_y accel_dumbbell_z
## 53 Levels: accel_belt_y yaw_forearm accel_belt_x ... classe
# Correlation funnel against membership in classe B.
corr_b <- corr_funl_cl %>% correlate(target = classe__B)
corr_b %>% plot_correlation_funnel(interactive = TRUE, limits = c(-0.5, 0.5))

# Top 20 distinct features, ordered by absolute correlation strength.
corr_b_sub <- corr_b %>%
  arrange(desc(abs(correlation))) %>%
  distinct(feature) %>%
  head(20)

# Which of those top features are also in the highly-correlated (>0.8) set?
corr_b_sub$feature[which(corr_b_sub$feature %in% x_tr_corr_un)]
## [1] accel_dumbbell_z yaw_dumbbell accel_dumbbell_x yaw_belt
## [5] pitch_dumbbell magnet_arm_z accel_belt_z magnet_arm_y
## 53 Levels: accel_belt_x accel_arm_y yaw_forearm roll_forearm ... classe
# Correlation funnel against membership in classe C.
corr_c <- corr_funl_cl %>% correlate(target = classe__C)
corr_c %>% plot_correlation_funnel(interactive = TRUE, limits = c(-0.5, 0.5))

# Top 20 distinct features, ordered by absolute correlation strength.
corr_c_sub <- corr_c %>%
  arrange(desc(abs(correlation))) %>%
  distinct(feature) %>%
  head(20)

# Which of those top features are also in the highly-correlated (>0.8) set?
corr_c_sub$feature[which(corr_c_sub$feature %in% x_tr_corr_un)]
## [1] accel_dumbbell_x pitch_dumbbell magnet_arm_z accel_dumbbell_z
## [5] pitch_belt total_accel_belt yaw_dumbbell
## 53 Levels: gyros_dumbbell_z accel_belt_z gyros_arm_z ... classe
# Correlation funnel against membership in classe D.
corr_d <- corr_funl_cl %>% correlate(target = classe__D)
corr_d %>% plot_correlation_funnel(interactive = TRUE, limits = c(-0.5, 0.5))

# Top 20 distinct features, ordered by absolute correlation strength.
corr_d_sub <- corr_d %>%
  arrange(desc(abs(correlation))) %>%
  distinct(feature) %>%
  head(20)

# Which of those top features are also in the highly-correlated (>0.8) set?
corr_d_sub$feature[which(corr_d_sub$feature %in% x_tr_corr_un)]
## [1] magnet_arm_y magnet_arm_x accel_arm_x pitch_dumbbell roll_belt
## 53 Levels: gyros_forearm_y gyros_belt_x gyros_arm_x ... classe
# Correlation funnel against membership in classe E.
corr_e <- corr_funl_cl %>% correlate(target = classe__E)
corr_e %>% plot_correlation_funnel(interactive = TRUE, limits = c(-0.5, 0.5))

# Top 20 distinct features, ordered by absolute correlation strength.
corr_e_sub <- corr_e %>%
  arrange(desc(abs(correlation))) %>%
  distinct(feature) %>%
  head(20)

# Which of those top features are also in the highly-correlated (>0.8) set?
corr_e_sub$feature[which(corr_e_sub$feature %in% x_tr_corr_un)]
## [1] roll_belt yaw_belt total_accel_belt accel_belt_z
## [5] magnet_arm_x magnet_arm_y magnet_arm_z accel_arm_x
## 53 Levels: magnet_forearm_x accel_forearm_x yaw_forearm ... classe
# Decision tree (rpart) baseline. The seed is set BEFORE train() so the
# bootstrap resampling is reproducible — the original called set.seed()
# only after fitting, where it has no effect on the model.
set.seed(200)
DTM <- train(classe ~ ., data = x_cvtrain, method = "rpart")
fancyRpartPlot(DTM$finalModel)
# Evaluate on the hold-out set; y_test must be a factor for confusionMatrix.
DTM_prediction <- predict(DTM, x_test)
confusionMatrix(DTM_prediction, as.factor(y_test))
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1776 535 585 508 197
## B 32 334 19 185 74
## C 141 459 593 432 425
## D 0 0 0 0 0
## E 4 0 0 0 566
##
## Overall Statistics
##
## Accuracy : 0.4762
## 95% CI : (0.4643, 0.4881)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3155
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9094 0.25151 0.49541 0.0000 0.44849
## Specificity 0.6285 0.94401 0.74294 1.0000 0.99929
## Pos Pred Value 0.4932 0.51863 0.28927 NaN 0.99298
## Neg Pred Value 0.9458 0.84022 0.87456 0.8361 0.88944
## Prevalence 0.2845 0.19345 0.17436 0.1639 0.18383
## Detection Rate 0.2587 0.04865 0.08638 0.0000 0.08245
## Detection Prevalence 0.5245 0.09381 0.29862 0.0000 0.08303
## Balanced Accuracy 0.7689 0.59776 0.61917 0.5000 0.72389
# Random forest (100 trees). Seed set BEFORE train() so the resampling and
# tree construction are reproducible — the original seeded after fitting,
# which has no effect on the model.
set.seed(200)
RF <- train(classe ~ ., data = x_cvtrain, method = "rf", ntree = 100)
RF_prediction <- predict(RF, x_test)
RF_CM <- confusionMatrix(RF_prediction, as.factor(y_test))
RF_CM
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1950 12 0 0 0
## B 3 1313 11 0 0
## C 0 3 1186 31 0
## D 0 0 0 1093 2
## E 0 0 0 1 1260
##
## Overall Statistics
##
## Accuracy : 0.9908
## 95% CI : (0.9883, 0.9929)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9884
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9985 0.9887 0.9908 0.9716 0.9984
## Specificity 0.9976 0.9975 0.9940 0.9997 0.9998
## Pos Pred Value 0.9939 0.9894 0.9721 0.9982 0.9992
## Neg Pred Value 0.9994 0.9973 0.9981 0.9945 0.9996
## Prevalence 0.2845 0.1934 0.1744 0.1639 0.1838
## Detection Rate 0.2840 0.1913 0.1728 0.1592 0.1835
## Detection Prevalence 0.2858 0.1933 0.1777 0.1595 0.1837
## Balanced Accuracy 0.9980 0.9931 0.9924 0.9856 0.9991
# Visualize the random-forest confusion matrix.
plot(RF_CM$table, col = RF_CM$byClass, main = "Random Forest Accuracy")

# Gradient boosting machine. Seed set BEFORE train() so the resampling is
# reproducible — the original seeded after fitting, which has no effect.
set.seed(200)
GBMM <- train(classe ~ ., data = x_cvtrain, method = "gbm", verbose = FALSE)
GBMM_prediction <- predict(GBMM, x_test)
confusionMatrix(GBMM_prediction, as.factor(y_test))
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1916 44 0 1 2
## B 23 1241 41 2 11
## C 10 41 1134 39 8
## D 4 2 19 1071 18
## E 0 0 3 12 1223
##
## Overall Statistics
##
## Accuracy : 0.9592
## 95% CI : (0.9543, 0.9638)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9484
##
## Mcnemar's Test P-Value : 8.302e-06
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9811 0.9345 0.9474 0.9520 0.9691
## Specificity 0.9904 0.9861 0.9827 0.9925 0.9973
## Pos Pred Value 0.9761 0.9416 0.9205 0.9614 0.9879
## Neg Pred Value 0.9925 0.9843 0.9888 0.9906 0.9931
## Prevalence 0.2845 0.1934 0.1744 0.1639 0.1838
## Detection Rate 0.2791 0.1808 0.1652 0.1560 0.1782
## Detection Prevalence 0.2859 0.1920 0.1795 0.1623 0.1803
## Balanced Accuracy 0.9857 0.9603 0.9650 0.9723 0.9832
# Quiz-set predictions from all three models, side by side.
# Columns 1:52 of X_test are the predictors; problem_id identifies each row.
quiz_predictors <- X_test[, 1:52]
result <- data.frame(
  problem_id     = X_test$problem_id,
  PREDICTION_DTM = predict(DTM, quiz_predictors),
  PREDICTION_RF  = predict(RF, quiz_predictors),
  PREDICTION_GBM = predict(GBMM, quiz_predictors)
)
result